JASON STOPAS
MODULE 04: LAB 01

In [1]:
import gdown
import polars as pl
import pandas as pd
import numpy as np
import sqlite3
import plotly.express as px
import plotly.graph_objects as go
In [2]:
from pathlib import Path

drive_loc = 'https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ'

download_LC = 'downloaded_file.csv'

# The export is ~717 MB (see the download log below) — skip the fetch when a
# previous run already saved it, so Restart & Run All stays cheap.
if not Path(download_LC).exists():
    gdown.download(drive_loc, download_LC, quiet=False)

Jobs = pl.read_csv(download_LC)
Downloading...
From (original): https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ
From (redirected): https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ&confirm=t&uuid=d2c62f77-a3a3-4c81-bb4c-ea5c60293206
To: C:\Users\jtsto\OneDrive\Documents\03 BU SPRING 2025\AD 688\Mod04_Lab01\downloaded_file.csv
100%|██████████| 717M/717M [01:06<00:00, 10.8MB/s] 
In [3]:
# Quick look at the first five rows of the raw posting data (131 columns).
Jobs.head()
Out[3]:
shape: (5, 131)
IDLAST_UPDATED_DATELAST_UPDATED_TIMESTAMPDUPLICATESPOSTEDEXPIREDDURATIONSOURCE_TYPESSOURCESURLACTIVE_URLSACTIVE_SOURCES_INFOTITLE_RAWBODYMODELED_EXPIREDMODELED_DURATIONCOMPANYCOMPANY_NAMECOMPANY_RAWCOMPANY_IS_STAFFINGEDUCATION_LEVELSEDUCATION_LEVELS_NAMEMIN_EDULEVELSMIN_EDULEVELS_NAMEMAX_EDULEVELSMAX_EDULEVELS_NAMEEMPLOYMENT_TYPEEMPLOYMENT_TYPE_NAMEMIN_YEARS_EXPERIENCEMAX_YEARS_EXPERIENCEIS_INTERNSHIPSALARYREMOTE_TYPEREMOTE_TYPE_NAMEORIGINAL_PAY_PERIODSALARY_TOSALARY_FROM…SOC_2021_5_NAMELOT_CAREER_AREALOT_CAREER_AREA_NAMELOT_OCCUPATIONLOT_OCCUPATION_NAMELOT_SPECIALIZED_OCCUPATIONLOT_SPECIALIZED_OCCUPATION_NAMELOT_OCCUPATION_GROUPLOT_OCCUPATION_GROUP_NAMELOT_V6_SPECIALIZED_OCCUPATIONLOT_V6_SPECIALIZED_OCCUPATION_NAMELOT_V6_OCCUPATIONLOT_V6_OCCUPATION_NAMELOT_V6_OCCUPATION_GROUPLOT_V6_OCCUPATION_GROUP_NAMELOT_V6_CAREER_AREALOT_V6_CAREER_AREA_NAMESOC_2SOC_2_NAMESOC_3SOC_3_NAMESOC_4SOC_4_NAMESOC_5SOC_5_NAMELIGHTCAST_SECTORSLIGHTCAST_SECTORS_NAMENAICS_2022_2NAICS_2022_2_NAMENAICS_2022_3NAICS_2022_3_NAMENAICS_2022_4NAICS_2022_4_NAMENAICS_2022_5NAICS_2022_5_NAMENAICS_2022_6NAICS_2022_6_NAME
strstrstri64strstri64strstrstrstrstrstrstrstri64i64strstrboolstrstri64stri64stri64stri64i64booli64i64strstri64i64…stri64stri64stri64stri64stri64stri64stri64stri64strstrstrstrstrstrstrstrstrstrstri64stri64stri64stri64stri64str
"1f57d95acf4dc67ed2819eb12f049f…"9/6/2024""2024-09-06 20:32:57.352 Z"0"6/2/2024""6/8/2024"6"[   "Company" ]""[   "brassring.com" ]""[   "https://sjobs.brassring.c…"[]"null"Enterprise Analyst (II-III)""31-May-2024 Enterprise Analys…"6/8/2024"6894731"Murphy USA""Murphy USA"false"[   2 ]""[   "Bachelor's degree" ]"2"Bachelor's degree"nullnull1"Full-time (> 32 hours)"22falsenull0"[None]"nullnullnull…"Data Scientists"23"Information Technology and Com…231010"Business Intelligence Analyst"23101011"General ERP Analyst / Consulta…2310"Business Intelligence"23101011"General ERP Analyst / Consulta…231010"Business Intelligence Analyst"2310"Business Intelligence"23"Information Technology and Com…"15-0000""Computer and Mathematical Occu…"15-2000""Mathematical Science Occupatio…"15-2050""Data Scientists""15-2051""Data Scientists""[   7 ]""[   "Artificial Intelligence" …44"Retail Trade"441"Motor Vehicle and Parts Dealer…4413"Automotive Parts, Accessories,…44133"Automotive Parts and Accessori…441330"Automotive Parts and Accessori…
"0cb072af26757b6c4ea9464472a50a…"8/2/2024""2024-08-02 17:08:58.838 Z"0"6/2/2024""8/1/2024"null"[   "Job Board" ]""[   "maine.gov" ]""[   "https://joblink.maine.gov…"[]"null"Oracle Consultant - Reports (3…"Oracle Consultant - Reports (3…"8/1/2024"null133098"Smx Corporation Limited""SMX"true"[   99 ]""[   "No Education Listed" ]"99"No Education Listed"nullnull1"Full-time (> 32 hours)"33falsenull1"Remote"nullnullnull…"Data Scientists"23"Information Technology and Com…231010"Business Intelligence Analyst"23101012"Oracle Consultant / Analyst"2310"Business Intelligence"23101012"Oracle Consultant / Analyst"231010"Business Intelligence Analyst"2310"Business Intelligence"23"Information Technology and Com…"15-0000""Computer and Mathematical Occu…"15-2000""Mathematical Science Occupatio…"15-2050""Data Scientists""15-2051""Data Scientists"nullnull56"Administrative and Support and…561"Administrative and Support Ser…5613"Employment Services"56132"Temporary Help Services"561320"Temporary Help Services"
"85318b12b3331fa490d32ad014379d…"9/6/2024""2024-09-06 20:32:57.352 Z"1"6/2/2024""7/7/2024"35"[   "Job Board" ]""[   "dejobs.org" ]""[   "https://dejobs.org/dallas…"[]"null"Data Analyst""Taking care of people is at th…"6/10/2024"839063746"Sedgwick""Sedgwick"false"[   2 ]""[   "Bachelor's degree" ]"2"Bachelor's degree"nullnull1"Full-time (> 32 hours)"5nullfalsenull0"[None]"nullnullnull…"Data Scientists"23"Information Technology and Com…231113"Data / Data Mining Analyst"23111310"Data Analyst"2311"Data Analysis and Mathematics"23111310"Data Analyst"231113"Data / Data Mining Analyst"2311"Data Analysis and Mathematics"23"Information Technology and Com…"15-0000""Computer and Mathematical Occu…"15-2000""Mathematical Science Occupatio…"15-2050""Data Scientists""15-2051""Data Scientists"nullnull52"Finance and Insurance"524"Insurance Carriers and Related…5242"Agencies, Brokerages, and Othe…52429"Other Insurance Related Activi…524291"Claims Adjusting"
"1b5c3941e54a1889ef4f8ae55b401a…"9/6/2024""2024-09-06 20:32:57.352 Z"1"6/2/2024""7/20/2024"48"[   "Job Board" ]""[   "disabledperson.com",   "d…"[   "https://www.disabledperso…"[]"null"Sr. Lead Data Mgmt. Analyst - …"About this role: Wells Fargo …"6/12/2024"1037615159"Wells Fargo""Wells Fargo"false"[   99 ]""[   "No Education Listed" ]"99"No Education Listed"nullnull1"Full-time (> 32 hours)"3nullfalsenull0"[None]"nullnullnull…"Data Scientists"23"Information Technology and Com…231113"Data / Data Mining Analyst"23111310"Data Analyst"2311"Data Analysis and Mathematics"23111310"Data Analyst"231113"Data / Data Mining Analyst"2311"Data Analysis and Mathematics"23"Information Technology and Com…"15-0000""Computer and Mathematical Occu…"15-2000""Mathematical Science Occupatio…"15-2050""Data Scientists""15-2051""Data Scientists""[   6 ]""[   "Data Privacy/Protection" …52"Finance and Insurance"522"Credit Intermediation and Rela…5221"Depository Credit Intermediati…52211"Commercial Banking"522110"Commercial Banking"
"cb5ca25f02bdf25c13edfede793150…"6/19/2024""2024-06-19 07:00:00.000 Z"0"6/2/2024""6/17/2024"15"[   "FreeJobBoard" ]""[   "craigslist.org" ]""[   "https://modesto.craigslis…"[]"null"Comisiones de $1000 - $3000 po…"Comisiones de $1000 - $3000 po…"6/17/2024"150"Unclassified""LH/GM"false"[   99 ]""[   "No Education Listed" ]"99"No Education Listed"nullnull3"Part-time / full-time"nullnullfalse925000"[None]""year"15000035000…"Data Scientists"23"Information Technology and Com…231010"Business Intelligence Analyst"23101012"Oracle Consultant / Analyst"2310"Business Intelligence"23101012"Oracle Consultant / Analyst"231010"Business Intelligence Analyst"2310"Business Intelligence"23"Information Technology and Com…"15-0000""Computer and Mathematical Occu…"15-2000""Mathematical Science Occupatio…"15-2050""Data Scientists""15-2051""Data Scientists"nullnull99"Unclassified Industry"999"Unclassified Industry"9999"Unclassified Industry"99999"Unclassified Industry"999999"Unclassified Industry"
In [4]:
# Row/column count of the full dataset.
Jobs.shape
Out[4]:
(72498, 131)
In [5]:
columns = Jobs.columns

# One column name per line — easier to scan than the repr of a 131-element list.
for column_name in columns:
    print(column_name)
ID
LAST_UPDATED_DATE
LAST_UPDATED_TIMESTAMP
DUPLICATES
POSTED
EXPIRED
DURATION
SOURCE_TYPES
SOURCES
URL
ACTIVE_URLS
ACTIVE_SOURCES_INFO
TITLE_RAW
BODY
MODELED_EXPIRED
MODELED_DURATION
COMPANY
COMPANY_NAME
COMPANY_RAW
COMPANY_IS_STAFFING
EDUCATION_LEVELS
EDUCATION_LEVELS_NAME
MIN_EDULEVELS
MIN_EDULEVELS_NAME
MAX_EDULEVELS
MAX_EDULEVELS_NAME
EMPLOYMENT_TYPE
EMPLOYMENT_TYPE_NAME
MIN_YEARS_EXPERIENCE
MAX_YEARS_EXPERIENCE
IS_INTERNSHIP
SALARY
REMOTE_TYPE
REMOTE_TYPE_NAME
ORIGINAL_PAY_PERIOD
SALARY_TO
SALARY_FROM
LOCATION
CITY
CITY_NAME
COUNTY
COUNTY_NAME
MSA
MSA_NAME
STATE
STATE_NAME
COUNTY_OUTGOING
COUNTY_NAME_OUTGOING
COUNTY_INCOMING
COUNTY_NAME_INCOMING
MSA_OUTGOING
MSA_NAME_OUTGOING
MSA_INCOMING
MSA_NAME_INCOMING
NAICS2
NAICS2_NAME
NAICS3
NAICS3_NAME
NAICS4
NAICS4_NAME
NAICS5
NAICS5_NAME
NAICS6
NAICS6_NAME
TITLE
TITLE_NAME
TITLE_CLEAN
SKILLS
SKILLS_NAME
SPECIALIZED_SKILLS
SPECIALIZED_SKILLS_NAME
CERTIFICATIONS
CERTIFICATIONS_NAME
COMMON_SKILLS
COMMON_SKILLS_NAME
SOFTWARE_SKILLS
SOFTWARE_SKILLS_NAME
ONET
ONET_NAME
ONET_2019
ONET_2019_NAME
CIP6
CIP6_NAME
CIP4
CIP4_NAME
CIP2
CIP2_NAME
SOC_2021_2
SOC_2021_2_NAME
SOC_2021_3
SOC_2021_3_NAME
SOC_2021_4
SOC_2021_4_NAME
SOC_2021_5
SOC_2021_5_NAME
LOT_CAREER_AREA
LOT_CAREER_AREA_NAME
LOT_OCCUPATION
LOT_OCCUPATION_NAME
LOT_SPECIALIZED_OCCUPATION
LOT_SPECIALIZED_OCCUPATION_NAME
LOT_OCCUPATION_GROUP
LOT_OCCUPATION_GROUP_NAME
LOT_V6_SPECIALIZED_OCCUPATION
LOT_V6_SPECIALIZED_OCCUPATION_NAME
LOT_V6_OCCUPATION
LOT_V6_OCCUPATION_NAME
LOT_V6_OCCUPATION_GROUP
LOT_V6_OCCUPATION_GROUP_NAME
LOT_V6_CAREER_AREA
LOT_V6_CAREER_AREA_NAME
SOC_2
SOC_2_NAME
SOC_3
SOC_3_NAME
SOC_4
SOC_4_NAME
SOC_5
SOC_5_NAME
LIGHTCAST_SECTORS
LIGHTCAST_SECTORS_NAME
NAICS_2022_2
NAICS_2022_2_NAME
NAICS_2022_3
NAICS_2022_3_NAME
NAICS_2022_4
NAICS_2022_4_NAME
NAICS_2022_5
NAICS_2022_5_NAME
NAICS_2022_6
NAICS_2022_6_NAME

Filter the postings down to those with complete salary information (SALARY, SALARY_FROM, and SALARY_TO all present).

In [6]:
# Keep only postings where all three salary fields are populated.
# Use lazy pl.col expressions instead of materialized Series (Jobs["..."]):
# that is the idiomatic polars filter form. The condition is the De Morgan
# equivalent of the original ~(null | null | null).
Jobs_w_salary_info = Jobs.filter(
    pl.col("SALARY").is_not_null()
    & pl.col("SALARY_TO").is_not_null()
    & pl.col("SALARY_FROM").is_not_null()
)
In [7]:
# Summary statistics (count, mean, std, quartiles) for the salary columns.
Jobs_w_salary_info.select(["SALARY_FROM", "SALARY_TO", "SALARY"]).describe()
Out[7]:
shape: (9, 4)
statisticSALARY_FROMSALARY_TOSALARY
strf64f64f64
"count"30808.030808.030808.0
"null_count"0.00.00.0
"mean"96108.560277139349.489938117953.755031
"std"38651.51593557867.60604645133.878359
"min"10230.016640.015860.0
"25%"66560.096471.084933.0
"50%"90000.0135000.0116300.0
"75%"119800.0178200.0145600.0
"max"500000.0500000.0500000.0
In [8]:
# 30,808 of the 72,498 postings carry complete salary info.
Jobs_w_salary_info.shape
Out[8]:
(30808, 131)
In [9]:
# Distribution of employment-type labels; value_counts yields one
# (value, count) struct per distinct label.
Jobs_w_salary_info.select(
    pl.col("EMPLOYMENT_TYPE_NAME").value_counts()
)
Out[9]:
shape: (3, 1)
EMPLOYMENT_TYPE_NAME
struct[2]
{"Part-time (≤ 32 hours)",1038}
{"Full-time (> 32 hours)",29151}
{"Part-time / full-time",619}
In [10]:
# Same distribution by numeric code; counts match the previous cell
# (1 = full-time, 2 = part-time, 3 = part/full-time).
Jobs_w_salary_info.select(
    pl.col("EMPLOYMENT_TYPE").value_counts()
)
Out[10]:
shape: (3, 1)
EMPLOYMENT_TYPE
struct[2]
{1,29151}
{2,1038}
{3,619}
In [11]:
# Salary summary per employment type, highest average first.
salary = pl.col("SALARY")

Jobs_w_salary_info.group_by("EMPLOYMENT_TYPE_NAME").agg(
    pl.len().alias("Job_Count"),
    salary.mean().alias("Avg_Salary"),
    salary.median().alias("Median_Salary"),
    salary.min().alias("Min_Salary"),
    salary.max().alias("Max_Salary"),
    salary.std().alias("Salary_StdDev"),
).sort("Avg_Salary", descending=True)
Out[11]:
shape: (3, 7)
EMPLOYMENT_TYPE_NAMEJob_CountAvg_SalaryMedian_SalaryMin_SalaryMax_SalarySalary_StdDev
stru32f64f64i64i64f64
"Full-time (> 32 hours)"29151118897.558609116500.02058350000044351.533443
"Part-time / full-time"619105621.242326100000.02080045537552979.422642
"Part-time (≤ 32 hours)"103898802.50963486390.01586031005055382.720356
In [12]:
# Box + point view of starting salary per employment type.
df_plot = Jobs_w_salary_info.select(["EMPLOYMENT_TYPE_NAME", "SALARY_FROM"]).to_pandas()

employment_types = df_plot["EMPLOYMENT_TYPE_NAME"].unique()

plot_01 = go.Figure()

# One grayscale box per employment type; x-axis labels make a legend redundant.
for emp_type in employment_types:
    type_salaries = df_plot.loc[df_plot["EMPLOYMENT_TYPE_NAME"] == emp_type, "SALARY_FROM"]
    box_trace = go.Box(
        y=type_salaries,
        name=emp_type,
        boxpoints='outliers',
        marker_color='lightgray',
        line_color='black',
        showlegend=False,
    )
    plot_01.add_trace(box_trace)

# Overlay every observation as a color-graded point so density is visible too.
point_trace = go.Scatter(
    x=df_plot["EMPLOYMENT_TYPE_NAME"],
    y=df_plot["SALARY_FROM"],
    mode='markers',
    marker=dict(
        color=df_plot["SALARY_FROM"],
        colorscale='Viridis',
        showscale=True,
        size=6,
        opacity=0.7,
        colorbar=dict(title="Starting Salary"),
    ),
    name='Salary Points',
    hoverinfo='x+y',
)
plot_01.add_trace(point_trace)

plot_01.update_layout(
    title="Starting Salary Distribution by Employment Type",
    xaxis_title="Employment Type",
    yaxis_title="Starting Salary",
    xaxis_tickangle=-45,
    template='plotly_white',
    yaxis=dict(tick0=0, dtick=50000, gridcolor="lightgray"),
    margin=dict(t=80, b=100, l=80, r=40),
    boxmode='group',
    boxgap=0.4,
    height=600,
    width=1000,
)

plot_01.show()
In [13]:
def salary_box_with_points(df_cat, category_col, title, xaxis_title,
                           colorscale='Turbo', height=700, width=1200,
                           bottom_margin=120):
    """Box plot of SALARY_FROM per category with every point overlaid.

    df_cat: pandas frame with `category_col` and a SALARY_FROM column.
    Returns the configured plotly Figure. This factors out the box-plus-points
    pattern that was previously copy-pasted per categorical column.
    """
    fig = go.Figure()
    # Grayscale box per category; x labels make a legend redundant.
    for category in df_cat[category_col].unique():
        fig.add_trace(
            go.Box(
                y=df_cat[df_cat[category_col] == category]["SALARY_FROM"],
                name=category,
                boxpoints='outliers',
                marker_color='lightgray',
                line_color='black',
                showlegend=False,
            )
        )
    # All observations as color-graded points for density.
    fig.add_trace(
        go.Scatter(
            x=df_cat[category_col],
            y=df_cat["SALARY_FROM"],
            mode='markers',
            marker=dict(
                color=df_cat["SALARY_FROM"],
                colorscale=colorscale,
                showscale=True,
                size=6,
                opacity=0.7,
                colorbar=dict(title="Starting Salary"),
            ),
            name='Salary Points',
            hoverinfo='x+y',
        )
    )
    fig.update_layout(
        title=title,
        xaxis_title=xaxis_title,
        yaxis_title="Starting Salary",
        xaxis_tickangle=-45,
        template='plotly_white',
        yaxis=dict(tick0=0, dtick=50000, gridcolor="lightgray"),
        margin=dict(t=80, b=bottom_margin, l=80, r=40),
        boxmode='group',
        boxgap=0.4,
        height=height,
        width=width,
    )
    return fig


# Same box-plus-points view as the employment-type chart, by 2-digit NAICS industry.
df_plot = Jobs_w_salary_info.select(["NAICS2_NAME", "SALARY_FROM"]).to_pandas()

industries = df_plot["NAICS2_NAME"].unique()

plot_02 = salary_box_with_points(
    df_plot,
    "NAICS2_NAME",
    title="Starting Salary Distribution by Industry (NAICS2_NAME)",
    xaxis_title="Industry (NAICS2)",
)

plot_02.show()
In [14]:
# Parse POSTED (m/d/Y strings) into a proper Date, count postings per day,
# then add a centered 7-day rolling mean in pandas for the smoothed line.
posted_as_date = (
    pl.col("POSTED")
    .cast(pl.Utf8)
    .str.strptime(pl.Date, "%m/%d/%Y", strict=False)
    .alias("POSTED_DATE")
)

Jobs_with_dates = Jobs.with_columns(posted_as_date)

daily_postings = (
    Jobs_with_dates
    .group_by("POSTED_DATE")
    .agg(pl.len().alias("Post_Count"))
    .sort("POSTED_DATE")
)

df_line = daily_postings.to_pandas()

df_line["Smoothed"] = df_line["Post_Count"].rolling(window=7, center=True).mean()
In [15]:
plot_03 = go.Figure()

# Raw counts in light gray, smoothed 7-day trend in red on top.
raw_trace = go.Scatter(
    x=df_line["POSTED_DATE"],
    y=df_line["Post_Count"],
    mode="lines",
    name="Raw Daily Count",
    line=dict(color="lightgray", width=2),
)
smoothed_trace = go.Scatter(
    x=df_line["POSTED_DATE"],
    y=df_line["Smoothed"],
    mode="lines",
    name="7-Day Smoothed",
    line=dict(color="red", width=3),
)
plot_03.add_trace(raw_trace)
plot_03.add_trace(smoothed_trace)

plot_03.update_layout(
    title="Job Postings Over Time (Raw + 7-Day Smoothed)",
    xaxis_title="Date",
    yaxis_title="Number of Postings",
    template="plotly_white",
    height=500,
    width=1000,
    xaxis_tickangle=-45,
    margin=dict(t=60, b=100, l=60, r=40),
    legend=dict(x=0.01, y=0.99, bordercolor="gray", borderwidth=1),
)

plot_03.show()
In [16]:
# Date coverage: postings span 2024-05-01 through 2024-09-30.
Jobs_with_dates.select([
    pl.col("POSTED_DATE").min().alias("Earliest"),
    pl.col("POSTED_DATE").max().alias("Latest")
])
Out[16]:
shape: (1, 2)
EarliestLatest
datedate
2024-05-012024-09-30
In [17]:
# Ten most frequent canonical job titles.
top_titles = (
    Jobs
    .group_by("TITLE_NAME")
    .agg(pl.len().alias("Count"))
    .sort("Count", descending=True)
    .head(10)
)
In [18]:
# Convert to pandas for plotly express.
df_top_titles = top_titles.to_pandas()
In [19]:
# Bar chart of the top-10 titles; per-bar color is decorative, so the legend is hidden.
plot_04 = px.bar(
    df_top_titles,
    x="TITLE_NAME",
    y="Count",
    color="TITLE_NAME",
    title="Top 10 Job Titles by Number of Postings",
    labels={"TITLE_NAME": "Job Title", "Count": "Number of Postings"},
    template="plotly_white",
)

layout_overrides = dict(
    xaxis_tickangle=-45,
    height=500,
    width=1000,
    showlegend=False,  # x-axis labels already identify each bar
    margin=dict(t=60, b=100, l=60, r=40),
)
plot_04.update_layout(**layout_overrides)

plot_04.show()
In [20]:
# Raw remote-type labels: real nulls and a literal "[None]" string both occur.
Jobs.select("REMOTE_TYPE_NAME").unique()
Out[20]:
shape: (5, 1)
REMOTE_TYPE_NAME
str
"Not Remote"
"Hybrid Remote"
null
"Remote"
"[None]"
In [21]:
# Collapse the two "missing" encodings (null and the "[None]" string) into "Unknown".
remote_raw = pl.col("REMOTE_TYPE_NAME")
missing_remote = remote_raw.is_null() | (
    remote_raw.cast(pl.Utf8).str.strip_chars("[]") == "None"
)

Jobs_cleaned_remote = Jobs.with_columns(
    pl.when(missing_remote)
    .then(pl.lit("Unknown"))
    .otherwise(remote_raw)
    .alias("REMOTE_TYPE_NAME")
)
In [22]:
# Confirm the cleanup: only the four expected labels remain.
Jobs_cleaned_remote.select("REMOTE_TYPE_NAME").unique()
Out[22]:
shape: (4, 1)
REMOTE_TYPE_NAME
str
"Not Remote"
"Remote"
"Unknown"
"Hybrid Remote"
In [23]:
# Posting count per remote-work type, largest first (input for the pie chart).
remote_counts = (
    Jobs_cleaned_remote
    .group_by("REMOTE_TYPE_NAME")
    .agg(pl.len().alias("Count"))
    .sort("Count", descending=True)
)
In [24]:
df_remote = remote_counts.to_pandas()

# Share of postings by remote-work type.
plot_05 = px.pie(
    df_remote,
    names="REMOTE_TYPE_NAME",
    values="Count",
    title="Distribution of Remote Work Types",
    color_discrete_sequence=px.colors.qualitative.Set3,
)

# Percentage + label inside each wedge rather than in a separate legend callout.
plot_05.update_traces(textposition='inside', textinfo='percent+label')
plot_05.update_layout(template="plotly_white", margin=dict(t=60, b=60, l=60, r=60))

plot_05.show()
In [25]:
# Number of distinct raw skill-list strings (not distinct individual skills).
Jobs.select("SKILLS_NAME").unique().shape
Out[25]:
(44173, 1)
In [26]:
# Sample of the raw values: each is a bracketed, quoted list serialized as one string.
Jobs.select("SKILLS_NAME").unique().head(20)
Out[26]:
shape: (20, 1)
SKILLS_NAME
str
"[   "Communication",   "Integr…
"[   "Databricks",   "Curiosity…
"[   "Management",   "Metadata …
"[   "Business Objectives",   "…
"[   "Detail Oriented",   "Ware…
…
"[   "Relational Databases",   …
"[   "Azure DevOps",   "Researc…
"[   "Information Technology", …
"[   "Object-Oriented Programmi…
"[   "Merchandising",   "Market…
In [27]:
# SKILLS_NAME stores each posting's skills as one bracketed, quoted string
# (see the sample cell above), so it must be *parsed* into a real list before
# exploding. A plain cast Utf8 -> List(Utf8) does not parse the text;
# str.json_decode is polars' documented way to decode a JSON array string.
exploded = Jobs.with_columns(
    pl.col("SKILLS_NAME").str.json_decode(pl.List(pl.Utf8))
).explode("SKILLS_NAME")
In [28]:
# Total skill mentions per 6-digit NAICS industry, top 15.
skills_per_industry = (
    exploded
    .group_by("NAICS_2022_6_NAME")
    .agg(pl.len().alias("Skill_Count"))
    .sort("Skill_Count", descending=True)
    .head(15)
)
In [29]:
# Hand off to pandas for plotly express.
df_industry_skills = skills_per_industry.to_pandas()
In [30]:
# Bar chart of skill mentions by industry; colors are decorative, legend hidden.
plot_06 = px.bar(
    df_industry_skills,
    x="NAICS_2022_6_NAME",
    y="Skill_Count",
    title="Top 15 Industries by Skill Mentions",
    labels={
        "NAICS_2022_6_NAME": "Industry",
        "Skill_Count": "Total Skill Mentions",
    },
    template="plotly_white",
    color="NAICS_2022_6_NAME",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

layout_tweaks = dict(
    xaxis_tickangle=-45,
    height=700,
    width=1200,
    margin=dict(t=60, b=140, l=60, r=40),
    showlegend=False,
)
plot_06.update_layout(**layout_tweaks)

plot_06.show()
In [31]:
# Only one non-null ONET occupation appears in this extract.
Jobs.select("ONET_NAME").unique()
Out[31]:
shape: (2, 1)
ONET_NAME
str
null
"Business Intelligence Analysts"
In [32]:
# Within the salary subset the ONET occupation is always populated.
Jobs_w_salary_info.select("ONET_NAME").unique()
Out[32]:
shape: (1, 1)
ONET_NAME
str
"Business Intelligence Analysts"
In [33]:
# Median advertised salary for the (single) ONET occupation in the data.
bi_analyst_jobs = Jobs_w_salary_info.filter(
    pl.col("ONET_NAME") == "Business Intelligence Analysts"
)
bi_analyst_jobs.select(
    pl.col("SALARY").median().alias("Median_Salary")
)
Out[33]:
shape: (1, 1)
Median_Salary
f64
116300.0
In [34]:
# Compute the median once into a named scalar instead of re-filtering and
# re-aggregating inline inside the px.bar call. Same value the previous cell
# reported (116300.0).
bi_median_salary = (
    Jobs_w_salary_info
    .filter(pl.col("ONET_NAME") == "Business Intelligence Analysts")
    .select(pl.median("SALARY"))[0, 0]
)

plot_07 = px.bar(
    x=["Business Intelligence Analysts"],
    y=[bi_median_salary],
    labels={"x": "[ONET] Occupation", "y": "Median Salary"},
    title="Median Salary: Business Intelligence Analysts",
    template="plotly_white"
)

plot_07.update_traces(marker_color='teal')

plot_07.show()
In [42]:
# SOC 2-digit groups present: one real group plus nulls.
Jobs.select("SOC_2021_2_NAME").unique()
Out[42]:
shape: (2, 1)
SOC_2021_2_NAME
str
"Computer and Mathematical Occu…
null
In [48]:
# Posting count per SOC 2-digit group (44 rows have no SOC code).
Jobs.group_by("SOC_2021_2_NAME").agg(
    pl.len().alias("Post_Count")
).sort("Post_Count", descending=True)
Out[48]:
shape: (2, 2)
SOC_2021_2_NAMEPost_Count
stru32
"Computer and Mathematical Occu…72454
null44
In [36]:
# SOC 3-digit groups present — again a single real group plus nulls.
Jobs.select("SOC_2021_3_NAME").unique()
Out[36]:
shape: (2, 1)
SOC_2021_3_NAME
str
"Mathematical Science Occupatio…
null
In [49]:
# Posting count per SOC 3-digit group; the null count matches the 2-digit level.
Jobs.group_by("SOC_2021_3_NAME").agg(
    pl.len().alias("Post_Count")
).sort("Post_Count", descending=True)
Out[49]:
shape: (2, 2)
SOC_2021_3_NAMEPost_Count
stru32
"Mathematical Science Occupatio…72454
null44
In [37]:
# Count postings for each (2-digit, 3-digit) SOC pair for the Sankey diagram.
# Filter the nulls out *before* grouping — same result as filtering the
# aggregated output, but the group_by never has to build the null groups.
transitions = (
    Jobs
    .filter(
        pl.col("SOC_2021_2_NAME").is_not_null()
        & pl.col("SOC_2021_3_NAME").is_not_null()
    )
    .group_by(["SOC_2021_2_NAME", "SOC_2021_3_NAME"])
    .agg(pl.len().alias("Count"))
)
In [38]:
# pandas frame for building the Sankey source/target index columns.
df_transitions = transitions.to_pandas()
In [39]:
# Build a stable integer index over every label that appears on either side
# of a transition, then map both columns through it (Sankey wants node ints).
source_labels = df_transitions["SOC_2021_2_NAME"].values
target_labels = df_transitions["SOC_2021_3_NAME"].values
all_labels = pd.unique(np.concatenate([source_labels, target_labels]))

label_to_index = {}
for idx, label in enumerate(all_labels):
    label_to_index[label] = idx

df_transitions["source"] = df_transitions["SOC_2021_2_NAME"].map(label_to_index)
df_transitions["target"] = df_transitions["SOC_2021_3_NAME"].map(label_to_index)
In [40]:
# Sankey: flow widths are posting counts from 2-digit to 3-digit SOC groups.
sankey_nodes = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=list(label_to_index.keys()),
    color="lightblue",
)
sankey_links = dict(
    source=df_transitions["source"],
    target=df_transitions["target"],
    value=df_transitions["Count"],
)

plot_08 = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])

plot_08.update_layout(
    title_text="SOC Transitions: 2-Digit → 3-Digit Level",
    font_size=12,
    height=700,
    width=1000,
)

plot_08.show()
In [ ]: